{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "*** Introductory Examples for the NLTK Book ***\n",
      "Loading text1, ..., text9 and sent1, ..., sent9\n",
      "Type the name of the text or sentence to view it.\n",
      "Type: 'texts()' or 'sents()' to list the materials.\n",
      "text1: Moby Dick by Herman Melville 1851\n",
      "text2: Sense and Sensibility by Jane Austen 1811\n",
      "text3: The Book of Genesis\n",
      "text4: Inaugural Address Corpus\n",
      "text5: Chat Corpus\n",
      "text6: Monty Python and the Holy Grail\n",
      "text7: Wall Street Journal\n",
      "text8: Personals Corpus\n",
      "text9: The Man Who Was Thursday by G . K . Chesterton 1908\n"
     ]
    }
   ],
   "source": [
    "import nltk\n",
    "from nltk.book import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<Text: Moby Dick by Herman Melville 1851>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "sent1: Call me Ishmael .\n",
      "sent2: The family of Dashwood had long been settled in Sussex .\n",
      "sent3: In the beginning God created the heaven and the earth .\n",
      "sent4: Fellow - Citizens of the Senate and of the House of Representatives :\n",
      "sent5: I have a problem with people PMing me to lol JOIN\n",
      "sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !\n",
      "sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .\n",
      "sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .\n",
      "sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .\n"
     ]
    }
   ],
   "source": [
    "sents()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Call', 'me', 'Ishmael', '.']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sent1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<Text: Wall Street Journal> 100676\n"
     ]
    }
   ],
   "source": [
    "print(text7, len(text7))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.'] 18\n"
     ]
    }
   ],
   "source": [
    "print(sent7, len(sent7))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Foster',\n",
       " 'tallies',\n",
       " 'rejected',\n",
       " 'budding',\n",
       " 'Ratings',\n",
       " 'earns',\n",
       " 'Raton',\n",
       " '8.70',\n",
       " 'Carnival',\n",
       " 'Driscoll']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(set(text7))[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "12408"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Frequency of words\n",
    "dist = FreqDist(text7)\n",
    "len(dist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vocab1 = list(dist.keys())\n",
    "vocab1[:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "dist['Vinken']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['billion',\n",
       " 'company',\n",
       " 'president',\n",
       " 'because',\n",
       " 'market',\n",
       " 'million',\n",
       " 'shares',\n",
       " 'trading',\n",
       " 'program']"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "freqwords = [w for w in vocab1 if len(w) > 5 and dist[w] > 100]\n",
    "freqwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['list', 'listed', 'lists', 'listing', 'listings']"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# different forms of the same \"word\"\n",
    "input1 = 'List listed lists listing listings'\n",
    "words1 = input1.lower().split(' ')\n",
    "words1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['list', 'list', 'list', 'list', 'list']"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "porter = nltk.PorterStemmer()\n",
    "[porter.stem(t) for t in words1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Children', \"shouldn't\", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tokenization\n",
    "text11 = \"Children shouldn't drink a sugary drink before bed.\"\n",
    "text11.split(' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Children',\n",
       " 'should',\n",
       " \"n't\",\n",
       " 'drink',\n",
       " 'a',\n",
       " 'sugary',\n",
       " 'drink',\n",
       " 'before',\n",
       " 'bed',\n",
       " '.']"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "nltk.word_tokenize(text11)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 36,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# sentence splitting\n",
    "text12 = 'This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!'\n",
    "sentences = nltk.sent_tokenize(text12)\n",
    "len(sentences)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['This is the first sentence.',\n",
       " 'A gallon of milk in the U.S. costs $2.99.',\n",
       " 'Is this the third sentence?',\n",
       " 'Yes, it is!']"
      ]
     },
     "execution_count": 37,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sentences"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
